import openpyxl
import re
import jieba

# Open the xlsx file
workbook = openpyxl.load_workbook('cizhui.xlsx')

# Select the active worksheet
sheet = workbook.active

# # Read the value of a single cell
# cell_value = sheet['A1'].value
# print(cell_value)

# # Read the values of an entire row and an entire column
# row_values = [cell.value for cell in sheet[1]]  # values of the first row
# print(row_values)
#
# column_values = [sheet[f'A{i}'].value for i in range(1, sheet.max_row + 1)]  # values of the first column
# print(column_values)

# Row data gathered from the sheet: one list of cell values per row,
# filled in by the row loop further down.
all_content = []


def replace_numbers(match):
    """Replacement callback for ``re.sub`` that maps any match to ``"number"``.

    Args:
        match: The match object supplied by ``re.sub``; its content is ignored.

    Returns:
        The literal placeholder string ``"number"``.
    """
    return "number"


# Patterns applied, in this order, by ``change``.  Parenthesised ranges come
# first so that a whole range collapses into a single placeholder before the
# generic number pattern gets a chance to split it into two.
# NOTE(review): the hyphen form only accepts integers, so "(1.5-2.5)" is not
# collapsed as a unit and ends up as "(number-number)" - confirm whether that
# is intended.
_NUMBER_PATTERNS = (
    re.compile(r'\(\d+-\d+\)'),                         # (1-2)
    re.compile(r'\(\d+(?:\.\d+)? – \d+(?:\.\d+)?\)'),   # (1.5 – 2.5), en dash with spaces
    re.compile(r'\(\d+(?:\.\d+)?–\d+(?:\.\d+)?\)'),     # (1.5–2.5), en dash without spaces
    re.compile(r'\d+(?:\.\d+)?'),                       # any remaining integer or decimal
)


def change(content):
    """Replace every numeric value in ``content`` with the word ``"number"``.

    Parenthesised ranges such as ``(1-2)``, ``(1.5 – 2.5)`` and ``(1.5–2.5)``
    are each replaced by a single placeholder; every other integer or decimal
    is then replaced individually.

    Args:
        content: The text to normalise (must be a ``str``).

    Returns:
        A copy of ``content`` with the numeric parts replaced.
    """
    new_content = content
    # The generic number pattern already consumes every digit run, so no
    # further digit-only pass is needed after it.
    for pattern in _NUMBER_PATTERNS:
        new_content = pattern.sub(replace_numbers, new_content)
    return new_content


# Collect one list of cell values per record.  A row whose first cell is not a
# regular ``Cell`` (presumably a ``MergedCell``, i.e. column A is merged with
# the row above - TODO confirm) is treated as a continuation line: its
# third-column text is appended to the previous record, separated by a space.
for row in sheet.iter_rows():
    if isinstance(row[0], openpyxl.cell.cell.Cell):
        all_content.append([cell.value for cell in row])
        continue

    extra = row[2].value
    if extra is None:
        # Empty continuation cell: nothing to append (and ``str + None``
        # would raise TypeError).
        continue
    previous = all_content[-1]
    if previous[2] is None:
        previous[2] = str(extra)
    else:
        previous[2] = f"{previous[2]} {extra}"
# Per-column accumulators filled from ``all_content``:
#   type1_list - first column of every row
#   type2_list - second column of every row
#   type3_list - comma-separated entries of the fourth column, stripped
#   contents   - third column with numbers normalised by ``change``
type1_list = []
type2_list = []
type3_list = []
contents = []

for row in all_content:
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if len(row) != 4:
        raise ValueError(f"expected 4 columns per row, got {len(row)}: {row!r}")
    type1_list.append(row[0])
    type2_list.append(row[1])
    if row[3]:
        # Add each comma-separated entry exactly once.
        type3_list.extend(part.strip() for part in row[3].split(","))
    # NOTE(review): "None" is appended for every row, not only for rows with an
    # empty fourth column - confirm this is the intended placeholder behaviour.
    type3_list.append("None")
    contents.append(change(row[2]))

# Distinct affix types, name types and property types, plus the distinct
# normalised texts.  ``dict.fromkeys`` removes duplicates while keeping
# first-seen order, so the resulting lists are identical from run to run
# (``set`` iteration order for strings varies with hash randomisation).
affix_list = list(dict.fromkeys(type1_list))
name_list = list(dict.fromkeys(type2_list))
property_list = list(dict.fromkeys(type3_list))
contents = list(dict.fromkeys(contents))
# Tokenised affix texts: each entry of ``contents`` is segmented with
# ``jieba.cut(..., cut_all=False)``, single-space tokens are dropped, and an
# end-of-sentence marker "<fin>" is appended.
sentences = [
    [token for token in jieba.cut(text, cut_all=False) if token != " "] + ["<fin>"]
    for text in contents
]

# Result of segmenting the affix texts
# print(len(sentences))
